In [1]:
    
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from load_utils import *
    
In [2]:
    
# Load the comment data. `d` is indexed by dataset name further down
# ('annotated' and 'sample' are the keys used in this notebook).
# NOTE(review): keep_diff=True presumably keeps the diff text columns
# (e.g. `clean_diff`, read by check_range) -- confirm in load_utils.
d = load_diffs(keep_diff=True)

# Block events and blocked-user text, returned together as a pair.
(df_events, df_blocked_user_text) = load_block_events_and_users()
    
In [3]:
    
# Empirical CDFs of the annotator-derived and model-predicted recipient
# scores on the annotated set. Left panel: full range. Right panel: same
# curves, zoomed to cumulative fractions in [0.90, 1].
# (plt / np are already imported in the first cell.)
fig = plt.figure(figsize=(12, 6))

# Sorted scores go on the x-axis; the i-th smallest score is paired with the
# cumulative fraction i / (n - 1), so each curve runs from 0 to 1.
xa = np.sort(d['annotated']['recipient_score'])
ya = 1. * np.arange(len(xa)) / (len(xa) - 1)
xm = np.sort(d['annotated']['pred_recipient_score'])
ym = 1. * np.arange(len(xm)) / (len(xm) - 1)

# Axis labels: x is the score value, y is the cumulative probability.
# (Previously the x-axis carried the '$P(X<x)$' label, which belongs on y.)
ax1 = fig.add_subplot(121)
ax1.plot(xa, ya, label = 'annotators')
ax1.plot(xm, ym, label = 'model')
ax1.set_xlabel('$x$')
ax1.set_ylabel('$P(X<x)$')
ax1.legend()

ax2 = fig.add_subplot(122)
ax2.plot(xa, ya, label = 'annotators')
ax2.plot(xm, ym, label = 'model')
ax2.set_xlabel('$x$')
ax2.set_ylabel('$P(X<x)$')
ax2.set_ylim((0.90, 1))
ax2.legend();  # trailing ';' hides the Legend repr in the cell output
    
    Out[3]:
    
Unlike the annotators, the model never assigns a score of exactly 0. The model's score distribution is skewed left for x > 0.2.
In [4]:
    
import matplotlib.pyplot as plt
import numpy as np

# Percentage of comments whose score exceeds a threshold t, for annotator
# scores and model scores on the annotated set (100 * (1 - empirical CDF)).
fig = plt.figure(figsize=(12, 6))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# Sorted scores act as the thresholds on the x-axis.
xa = np.sort(d['annotated']['recipient_score'])
xm = np.sort(d['annotated']['pred_recipient_score'])

# Rank i of n maps to 100 * (1 - i / (n - 1)) percent.
ya = 100 * (1 - np.arange(len(xa)) / (len(xa) - 1.))
ym = 100 * (1 - np.arange(len(xm)) / (len(xm) - 1.))

# Both panels show the same two curves; only the y-label and limits differ.
for panel, panel_ylabel in (
    (ax1, 'Percent of comments that are attacks'),
    (ax2, 'Percent of comments above threshold t'),
):
    panel.plot(xa, ya, label='annotators')
    panel.plot(xm, ym, label='model')
    panel.set_xlabel('$t$')
    panel.set_ylabel(panel_ylabel)
    panel.legend()

# Right panel zooms in on t in [0.2, 1.0] and percentages in [0, 3].
ax2.set_ylim((0.0, 3))
ax2.set_xlim((0.2, 1.0))

plt.savefig('../../paper/figs/model_vs_annotator_percent_attack_distributions.png')
    
    
In [5]:
    
def check_range(d, col, min, max, n=10, random_state=None):
    """Print a random sample of rows whose ``col`` value lies in ``[min, max]``.

    For each sampled row the value of ``col`` is printed, followed by the
    row's ``clean_diff`` field and a blank separator.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing ``col`` and a ``clean_diff`` column.
    col : str
        Name of the numeric column to filter on.
    min, max : float
        Inclusive lower and upper bounds for ``col``.
    n : int, default 10
        Maximum number of rows to print. If fewer than ``n`` rows fall in the
        range, all of them are printed (instead of raising ``ValueError``).
    random_state : int or numpy.random.RandomState, optional
        Passed to ``DataFrame.sample`` so that a spot check can be reproduced.

    Returns
    -------
    None
    """
    # A boolean mask is used instead of a '%f'-formatted query string: '%f'
    # rounds the bounds to 6 decimals, and a query string breaks on column
    # names that are not valid Python identifiers.
    in_range = d[(d[col] >= min) & (d[col] <= max)]

    # The builtin min() is shadowed by the `min` parameter, so clamp by hand.
    available = len(in_range)
    sample_size = n if n < available else available
    if sample_size <= 0:
        return

    sampled = in_range.sample(sample_size, random_state=random_state)
    for _, r in sampled.iterrows():
        print(r[col])
        print(r.clean_diff)
        print('\n')
    
In [7]:
    
# Spot-check randomly sampled comments from the 'sample' set whose predicted
# aggression score falls in [0.6, 0.7].
check_range(d['sample'], col='pred_aggression_score', min=0.6, max=0.7)
    
    
In [7]:
    
# Overlaid density estimates (no histogram) of aggression scores from three
# sources. Missing scores are dropped before plotting.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; migrate to
# sns.kdeplot when the seaborn version used by this project is upgraded.

# annotator scores on the annotated data
sns.distplot(d['annotated']['aggression_score'].dropna(), hist=False, label='annotator scores')
# model predictions on the same annotated data
sns.distplot(d['annotated']['pred_aggression_score'].dropna(), hist=False, label='model on annotated data')
# model predictions on the (no-admin) sample
sns.distplot(d['sample']['pred_aggression_score'].dropna(), hist=False, label='model sample')
plt.xlim(-1, 1)
plt.legend();  # trailing ';' hides the Legend repr in the cell output
    
    Out[7]:
    
In [8]:
    
# Normalised histogram (no KDE curve) of the annotator recipient scores,
# ignoring missing values.
annotator_recipient_scores = d['annotated']['recipient_score'].dropna()
sns.distplot(annotator_recipient_scores, kde=False, norm_hist=True)
    
    Out[8]:
    
In [9]:
    
# Normalised histograms (no KDE curve) of the model's recipient scores on the
# annotated data and on the sample, overlaid on the same axes. Each histogram
# is labelled so the two can be told apart in the legend.
sns.distplot(d['annotated']['pred_recipient_score'].dropna(), kde=False, norm_hist=True, label='model on annotated data')
sns.distplot(d['sample']['pred_recipient_score'].dropna(), kde=False, norm_hist=True, label='model sample')
plt.legend();  # trailing ';' hides the Legend repr in the cell output
    
    Out[9]:
    
In [ ]: